{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install pytube --user\n",
    "!pip install requests\n",
    "!pip install pandas\n",
    "!pip install numpy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing video download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from pytube import YouTube\n",
    "import re\n",
    "\n",
    "def remove_special_characters(input_string):\n",
    "    # Using regex to keep only alphanumeric characters and spaces\n",
    "    clean_string = re.sub(r'[^a-zA-Z0-9\\s]', '', input_string)\n",
    "    return clean_string\n",
    "\n",
    "def download_video_info(video_url, output_directory='downloads'):\n",
    "    try:\n",
    "        # Create a YouTube object\n",
    "        yt = YouTube(video_url)\n",
    "\n",
    "        # Create a directory for downloads if it doesn't exist\n",
    "        if not os.path.exists(output_directory):\n",
    "            os.makedirs(output_directory)\n",
    "\n",
    "        # Get the highest resolution audio stream\n",
    "        audio_stream = yt.streams.filter(only_audio=True).first()\n",
    "\n",
    "        # Download the audio stream\n",
    "        \n",
    "        file_name = remove_special_characters(yt.title)\n",
    "        file_name = file_name.replace(\" \",\"_\")\n",
    "        file_name = file_name.replace(\"…\",\"_\")\n",
    "        file_name = file_name.replace(\",\",\"_\")\n",
    "        \n",
    "        audio_stream.download(output_directory,filename=f'{file_name}.wav')\n",
    "        audio_path = os.path.join(f\"{output_directory}/{file_name}.wav\")\n",
    "        print(f\"Downloading audio to {audio_path}...\")\n",
    "\n",
    "        # Collect video information\n",
    "        video_info = {\n",
    "            'title': yt.title,\n",
    "            'duration': yt.length,\n",
    "            'author': yt.author,\n",
    "            'views': yt.views,\n",
    "            'description': yt.description,\n",
    "            'audio_path': audio_path\n",
    "        }\n",
    "\n",
    "        return video_info\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"An error occurred: {str(e)}\")\n",
    "        return None\n",
    "\n",
    "def process_video_links(file_path):\n",
    "    with open(file_path, 'r') as file:\n",
    "        video_links = file.readlines()\n",
    "\n",
    "    video_data_list = []\n",
    "    \n",
    "    # video_links = [\"https://www.youtube.com/watch?v=CgruI1RjH_c\"]\n",
    "\n",
    "    for video_link in video_links:\n",
    "        video_link = video_link.strip()\n",
    "        video_info = download_video_info(video_link)\n",
    "        \n",
    "        if video_info:\n",
    "            video_data_list.append(video_info)\n",
    "\n",
    "    # Save video data to a JSON file\n",
    "    output_json_path = 'video_data.json'\n",
    "    with open(output_json_path, 'w') as json_file:\n",
    "        json.dump(video_data_list, json_file, indent=2)\n",
    "\n",
    "    print(f'Video data saved to {output_json_path}')\n",
    "\n",
    "# Replace 'YOUR_TEXT_FILE_PATH' with the path to your text file containing video links\n",
    "text_file_path = \"./downloads/Fireship_clone/@Fireship-shorts.txt\"\n",
    "\n",
    "# Process video links and save data to JSON\n",
    "process_video_links(text_file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing Audio Transcription api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from deepgram import DeepgramClient, PrerecordedOptions\n",
    "\n",
    "def transcribe_audio(audio_file_path):\n",
    "\n",
    "    # Your Deepgram API Key\n",
    "    DEEPGRAM_API_KEY = ''\n",
    "\n",
    "    # Initialize the Deepgram SDK\n",
    "    deepgram = DeepgramClient(DEEPGRAM_API_KEY)\n",
    "\n",
    "    # Call the transcribe_file method on the prerecorded class\n",
    "    with open(audio_file_path, \"rb\") as file:\n",
    "        buffer_data = file.read()\n",
    "\n",
    "    payload = {\n",
    "        \"buffer\": buffer_data,\n",
    "    }\n",
    "\n",
    "    options = PrerecordedOptions(\n",
    "        model=\"nova-2\",\n",
    "        language=\"en\",\n",
    "        smart_format=True,\n",
    "        punctuate=True,\n",
    "        paragraphs=True,\n",
    "        diarize=True,\n",
    "        summarize=\"v2\",\n",
    "        detect_topics=True,\n",
    "        filler_words=True,\n",
    "    )\n",
    "\n",
    "    file_response = deepgram.listen.prerecorded.v(\"1\").transcribe_file(payload, options)\n",
    "    file_response = file_response.to_json()\n",
    "\n",
    "    json_final = json.loads(file_response)\n",
    "\n",
    "    with open(f\"test.json\", \"w\") as file:\n",
    "        json.dump(json_final, file, indent=4)\n",
    "        \n",
    "    return json_final\n",
    "    \n",
    "\n",
    "# # Example usage:\n",
    "# audio_file_path = \"./downloads/Fireship_clone/100+_Computer_Science_Concepts_Explained.wav\"\n",
    "# transcribe_audio(audio_file_path)\n",
    "# print(\"Transcribing completed successfully\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Download Youtube video transcribe it and save the transcribe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from pytube import YouTube\n",
    "from tqdm import tqdm\n",
    "from deepgram import DeepgramClient, PrerecordedOptions\n",
    "\n",
    "def download_and_transcribe_video(video_url, output_directory='downloads'):\n",
    "    try:\n",
    "        # Create a YouTube object\n",
    "        yt = YouTube(video_url)\n",
    "\n",
    "        # Create a directory for downloads if it doesn't exist\n",
    "        if not os.path.exists(output_directory):\n",
    "            os.makedirs(output_directory)\n",
    "\n",
    "        # Get the highest resolution audio stream\n",
    "        audio_stream = yt.streams.filter(only_audio=True).first()\n",
    "\n",
    "        # Download the audio stream with tqdm progress bar\n",
    "        file_name = remove_special_characters(yt.title)\n",
    "        file_name = file_name.replace(\" \",\"_\")\n",
    "        file_name = file_name.replace(\"…\",\"_\")\n",
    "        file_name = file_name.replace(\",\",\"_\")\n",
    "        audio_path = os.path.join(output_directory, f'{file_name}.wav')\n",
    "        print(f\"Downloading audio to {audio_path}...\")\n",
    "        # with tqdm(total=audio_stream.filesize, unit='B', unit_scale=True, desc=f'Downloading {file_name}') as bar:\n",
    "        #     def on_progress(chunk, _):\n",
    "        #         bar.update(len(chunk))\n",
    "\n",
    "        audio_stream.download(output_directory, filename=f'{file_name}.wav')\n",
    "\n",
    "        transcript = transcribe_audio(audio_path)\n",
    "\n",
    "        # Collect video information\n",
    "        video_info = {\n",
    "            'link': video_url,\n",
    "            'title': yt.title,\n",
    "            'duration': yt.length,\n",
    "            'author': yt.author,\n",
    "            'views': yt.views,\n",
    "            'description': yt.description,\n",
    "            'audio_path': audio_path,\n",
    "            'transcript': transcript\n",
    "        }\n",
    "        \n",
    "        save_transcript_to_json(video_info, f'{output_directory}/{file_name}_transcript.json')\n",
    "        append_transcript_to_json(video_info, f'final_json_transcript_final.json')\n",
    "\n",
    "        return video_info\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"An error occurred: {str(e)}\")\n",
    "        return None\n",
    "\n",
    "def save_transcript_to_json(transcript, json_path):\n",
    "    with open(json_path, 'w') as file:\n",
    "        json.dump(transcript, file, indent=4)\n",
    "    print(f'Transcript saved to {json_path}')\n",
    "\n",
    "def append_transcript_to_json(transcript, json_path):\n",
    "    # Create an empty list if the file doesn't exist yet\n",
    "    if not os.path.exists(json_path):\n",
    "        with open(json_path, 'w') as file:\n",
    "            json.dump([], file)\n",
    "\n",
    "    # Load existing data from the file\n",
    "    with open(json_path, 'r') as file:\n",
    "        data = json.load(file)\n",
    "\n",
    "    # Append the new transcript to the list\n",
    "    data.append(transcript)\n",
    "\n",
    "    # Save the updated list to the file\n",
    "    with open(json_path, 'w') as file:\n",
    "        json.dump(data, file, indent=4)\n",
    "\n",
    "    print(f'Transcript appended to {json_path}')\n",
    "\n",
    "def process_video_links(file_path):\n",
    "    with open(file_path, 'r') as file:\n",
    "        video_list = file.readlines()\n",
    "        \n",
    "    video_data_list = []\n",
    "    videos_to_process = [video_line.strip().split(\",\") for video_line in video_list if video_line.strip().endswith(',0')]\n",
    "\n",
    "    # for video_link in tqdm(video_list, desc='Processing videos', unit='video'):\n",
    "    # for idx, video_line in enumerate(tqdm(video_list, desc='Processing videos', unit='video')):\n",
    "    for video_link, progress in tqdm(videos_to_process, desc='Processing videos', unit='video'):\n",
    "        # video_link, progress = video_line.split(\",\")\n",
    "        if int(progress) == 0:\n",
    "            video_link = video_link.strip()\n",
    "            print(f'\\nDownloading and transcribing: {video_link}')\n",
    "            try:\n",
    "                video_info = download_and_transcribe_video(video_link)\n",
    "                video_data_list.append(video_info)\n",
    "                idx = video_list.index(f'{video_link},0\\n')\n",
    "                video_list[idx] = f'{video_link},1\\n'\n",
    "            except:\n",
    "                # save failed video links in a text file\n",
    "                print(f'\\nError processing video',video_link)\n",
    "                with open(\"logs_file.txt\", 'a') as log_file:\n",
    "                    log_file.write(video_link)\n",
    "        else:\n",
    "            print(\"Video already downloaded and processed\")\n",
    "            \n",
    "        with open(file_path, \"w\") as file:\n",
    "            file.writelines(video_list)\n",
    "\n",
    "# Replace 'YOUR_TEXT_FILE_PATH' with the path to your text file containing video links\n",
    "text_file_path = \"./downloads/Fireship_clone_2/@Fireship-videos-remaining.txt\"\n",
    "\n",
    "# Process video links and save data to JSON\n",
    "process_video_links(text_file_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Combined JSON data saved to combined.json\n"
     ]
    }
   ],
   "source": [
    "# combine all the json files into a single file\n",
    "\n",
    "import os\n",
    "import json\n",
    "\n",
    "def combine_json_files(directory_name, output_file='combined.json'):\n",
    "    combined_data = []\n",
    "\n",
    "    # Check if the directory exists\n",
    "    if not os.path.exists(directory_name) or not os.path.isdir(directory_name):\n",
    "        print(f\"Error: {directory_name} is not a valid directory.\")\n",
    "        return\n",
    "\n",
    "    # Loop through all files in the directory\n",
    "    for filename in os.listdir(directory_name):\n",
    "        file_path = os.path.join(directory_name, filename)\n",
    "\n",
    "        # Check if the file is a JSON file\n",
    "        if os.path.isfile(file_path) and filename.endswith('.json'):\n",
    "            with open(file_path, 'r') as file:\n",
    "                try:\n",
    "                    # Load JSON data from the file\n",
    "                    json_data = json.load(file)\n",
    "\n",
    "                    # Append the loaded data to the combined_data list\n",
    "                    combined_data.append(json_data)\n",
    "\n",
    "                except json.JSONDecodeError as e:\n",
    "                    print(f\"Error decoding JSON in file {filename}: {e}\")\n",
    "\n",
    "    # Write the combined_data to a new JSON file\n",
    "    with open(output_file, 'w') as output_file:\n",
    "        json.dump(combined_data, output_file, indent=2)\n",
    "\n",
    "    print(f\"Combined JSON data saved to {output_file.name}\")\n",
    "\n",
    "# Example usage:\n",
    "directory_name = './downloads/'\n",
    "combine_json_files(directory_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Huggingface Dataset prepraration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"json\", data_files=\"./combined.json\")\n",
    "# dataset2 = load_dataset(\"json\", data_files=\"./final_json_transcript_final.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['author', 'duration', 'description', 'transcript', 'audio_path', 'link', 'title', 'views'],\n",
       "        num_rows: 522\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New links written to output.txt\n"
     ]
    }
   ],
   "source": [
    "# Code to see which videos links have failed to download and transcribe\n",
    "\n",
    "# Assuming you have the text file named 'input.txt' and the list of video links\n",
    "# named 'video_link_list'\n",
    "\n",
    "\n",
    "input_file_path = './downloads/Fireship_clone_2/@Fireship-videos.txt'\n",
    "output_file_path = 'output.txt'\n",
    "\n",
    "# Read the existing links from the text file\n",
    "with open(input_file_path, 'r') as file:\n",
    "    existing_links = [line.split(',')[0] for line in file]\n",
    "\n",
    "# Filter out the links that are not in video_link_list\n",
    "new_links = [link for link in existing_links if link not in dataset[\"train\"][\"link\"]]\n",
    "\n",
    "# Write the new links to the output file\n",
    "with open(output_file_path, 'w') as output_file:\n",
    "    for link in new_links:\n",
    "        output_file.write(f\"{link},0\\n\")\n",
    "\n",
    "print(f\"New links written to {output_file_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['author', 'duration', 'description', 'transcript', 'audio_path', 'link', 'title', 'views'],\n",
       "        num_rows: 522\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df = pd.DataFrame(dataset['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>duration</th>\n",
       "      <th>description</th>\n",
       "      <th>transcript</th>\n",
       "      <th>audio_path</th>\n",
       "      <th>link</th>\n",
       "      <th>title</th>\n",
       "      <th>views</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>787</td>\n",
       "      <td>Learn the fundamentals of Computer Science wit...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_Computer_Science_Concepts_Expla...</td>\n",
       "      <td>https://www.youtube.com/watch?v=-uleG_Vecis</td>\n",
       "      <td>100+ Computer Science Concepts Explained</td>\n",
       "      <td>2110216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>743</td>\n",
       "      <td>The ultimate 10 minute JavaScript course that ...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_JavaScript_Concepts_you_Need_to...</td>\n",
       "      <td>https://www.youtube.com/watch?v=lkIFF4maKMU</td>\n",
       "      <td>100+ JavaScript Concepts you Need to Know</td>\n",
       "      <td>1642938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>798</td>\n",
       "      <td>WebDev 101 is a complete introduction into the...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_Web_Development_Things_you_Shou...</td>\n",
       "      <td>https://www.youtube.com/watch?v=erEgovG9WBs</td>\n",
       "      <td>100+ Web Development Things you Should Know</td>\n",
       "      <td>1296840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>1471</td>\n",
       "      <td>Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100_Firebase_Tips,_Tricks,_and_Screw...</td>\n",
       "      <td>https://www.youtube.com/watch?v=iWEgpdVSZyg</td>\n",
       "      <td>100 Firebase Tips, Tricks, and Screw-ups</td>\n",
       "      <td>177364</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>246</td>\n",
       "      <td>Google made a ton of exciting announcements at...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\10_crazy_announcements_from_Google_I...</td>\n",
       "      <td>https://www.youtube.com/watch?v=nmfRDRNjCnM</td>\n",
       "      <td>10 crazy announcements from Google I/O</td>\n",
       "      <td>968111</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     author  duration                                        description  \\\n",
       "0  Fireship       787  Learn the fundamentals of Computer Science wit...   \n",
       "1  Fireship       743  The ultimate 10 minute JavaScript course that ...   \n",
       "2  Fireship       798  WebDev 101 is a complete introduction into the...   \n",
       "3  Fireship      1471  Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...   \n",
       "4  Fireship       246  Google made a ton of exciting announcements at...   \n",
       "\n",
       "                                          transcript  \\\n",
       "0  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "1  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "2  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "3  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "4  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "\n",
       "                                          audio_path  \\\n",
       "0  downloads\\100+_Computer_Science_Concepts_Expla...   \n",
       "1  downloads\\100+_JavaScript_Concepts_you_Need_to...   \n",
       "2  downloads\\100+_Web_Development_Things_you_Shou...   \n",
       "3  downloads\\100_Firebase_Tips,_Tricks,_and_Screw...   \n",
       "4  downloads\\10_crazy_announcements_from_Google_I...   \n",
       "\n",
       "                                          link  \\\n",
       "0  https://www.youtube.com/watch?v=-uleG_Vecis   \n",
       "1  https://www.youtube.com/watch?v=lkIFF4maKMU   \n",
       "2  https://www.youtube.com/watch?v=erEgovG9WBs   \n",
       "3  https://www.youtube.com/watch?v=iWEgpdVSZyg   \n",
       "4  https://www.youtube.com/watch?v=nmfRDRNjCnM   \n",
       "\n",
       "                                         title    views  \n",
       "0     100+ Computer Science Concepts Explained  2110216  \n",
       "1    100+ JavaScript Concepts you Need to Know  1642938  \n",
       "2  100+ Web Development Things you Should Know  1296840  \n",
       "3     100 Firebase Tips, Tricks, and Screw-ups   177364  \n",
       "4       10 crazy announcements from Google I/O   968111  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>duration</th>\n",
       "      <th>description</th>\n",
       "      <th>transcript_json</th>\n",
       "      <th>audio_path</th>\n",
       "      <th>link</th>\n",
       "      <th>title</th>\n",
       "      <th>views</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>787</td>\n",
       "      <td>Learn the fundamentals of Computer Science wit...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_Computer_Science_Concepts_Expla...</td>\n",
       "      <td>https://www.youtube.com/watch?v=-uleG_Vecis</td>\n",
       "      <td>100+ Computer Science Concepts Explained</td>\n",
       "      <td>2110216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>743</td>\n",
       "      <td>The ultimate 10 minute JavaScript course that ...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_JavaScript_Concepts_you_Need_to...</td>\n",
       "      <td>https://www.youtube.com/watch?v=lkIFF4maKMU</td>\n",
       "      <td>100+ JavaScript Concepts you Need to Know</td>\n",
       "      <td>1642938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>798</td>\n",
       "      <td>WebDev 101 is a complete introduction into the...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_Web_Development_Things_you_Shou...</td>\n",
       "      <td>https://www.youtube.com/watch?v=erEgovG9WBs</td>\n",
       "      <td>100+ Web Development Things you Should Know</td>\n",
       "      <td>1296840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>1471</td>\n",
       "      <td>Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100_Firebase_Tips,_Tricks,_and_Screw...</td>\n",
       "      <td>https://www.youtube.com/watch?v=iWEgpdVSZyg</td>\n",
       "      <td>100 Firebase Tips, Tricks, and Screw-ups</td>\n",
       "      <td>177364</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>246</td>\n",
       "      <td>Google made a ton of exciting announcements at...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\10_crazy_announcements_from_Google_I...</td>\n",
       "      <td>https://www.youtube.com/watch?v=nmfRDRNjCnM</td>\n",
       "      <td>10 crazy announcements from Google I/O</td>\n",
       "      <td>968111</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     author  duration                                        description  \\\n",
       "0  Fireship       787  Learn the fundamentals of Computer Science wit...   \n",
       "1  Fireship       743  The ultimate 10 minute JavaScript course that ...   \n",
       "2  Fireship       798  WebDev 101 is a complete introduction into the...   \n",
       "3  Fireship      1471  Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...   \n",
       "4  Fireship       246  Google made a ton of exciting announcements at...   \n",
       "\n",
       "                                     transcript_json  \\\n",
       "0  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "1  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "2  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "3  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "4  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "\n",
       "                                          audio_path  \\\n",
       "0  downloads\\100+_Computer_Science_Concepts_Expla...   \n",
       "1  downloads\\100+_JavaScript_Concepts_you_Need_to...   \n",
       "2  downloads\\100+_Web_Development_Things_you_Shou...   \n",
       "3  downloads\\100_Firebase_Tips,_Tricks,_and_Screw...   \n",
       "4  downloads\\10_crazy_announcements_from_Google_I...   \n",
       "\n",
       "                                          link  \\\n",
       "0  https://www.youtube.com/watch?v=-uleG_Vecis   \n",
       "1  https://www.youtube.com/watch?v=lkIFF4maKMU   \n",
       "2  https://www.youtube.com/watch?v=erEgovG9WBs   \n",
       "3  https://www.youtube.com/watch?v=iWEgpdVSZyg   \n",
       "4  https://www.youtube.com/watch?v=nmfRDRNjCnM   \n",
       "\n",
       "                                         title    views  \n",
       "0     100+ Computer Science Concepts Explained  2110216  \n",
       "1    100+ JavaScript Concepts you Need to Know  1642938  \n",
       "2  100+ Web Development Things you Should Know  1296840  \n",
       "3     100 Firebase Tips, Tricks, and Screw-ups   177364  \n",
       "4       10 crazy announcements from Google I/O   968111  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rename(columns={'transcript': 'transcript_json'}, inplace=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "What's the first thing you should do when your code throws an error? Obviously, you should change nothing and try to run it again a few times. If that doesn't work, you're gonna need a computer science degree. The awesome thing about software engineering is that you can learn to code and get a high paying job, while literally having no idea how anything actually works. It all just feels like magic. Like a pilot driving a giant metal tube in the sky while knowing nothing about aerodynamics. Mother of God, no. Holy shit. Shit. Welcome to computer science 101. In today's video, you'll learn the science behind the garbage code you've been writing by learning 101 different computer science terms and concepts. This is a computer. It's just a piece of tape that holds ones and zeros along with a device that can read and write to it. It's called a Turing machine and in theory, it can compute anything, like the graphics in this video or the algorithm that recommended that you watch it. At the core of modern computers, we have the central processing unit. If we crack rack it open, we find a piece of silicon that contains billions of tiny transistors, which are like microscopic on off switches. The value at one of these switches is called a bit end is the smallest piece of information a computer can use. However, 1 bit by itself is not very useful, so they come in a package of 8 called a byte. 1 byte can represent 250 6 different values, like all the characters that you type on your keyboard. In fact, when you type into your keyboard, the character produced is actually mapped to a binary value in a character encoding like ASCII or utf8 binary is just a system for counting, like the base ten system you normally use when counting seeing on your fingers, but it only has 2 characters, 1 and 0. Humans have a hard time reading binary, so most often it's represented in a hexadecimal base 16 format, where ten numbers and 6 letters can represent a 4 bit group called a nibble. As a developer, when you write code in a programming language, it will actually be converted into machine code, which is a binary format that can be decoded and executed by the CPU. What it doesn't do though is store data for your applications. For that, computers have random access memory or RAM. It's like a neighborhood, and inside of every house lives a byte. Every location has a memory address, which the CPU can read and write too. You can think of the CPU and RAM as the brain of the computer. But in order for a computer to be useful, it needs to handle input and output. An input device might be the keyboard and mouse, while an output device might be your monitor. Luckily, most developers don't need to worry about how this hardware fits together because we have operating system kernels, like Linux, Mac, and Windows that control all hardware resources via device drivers. Now, to start hacking on the operating system, your first entry point is the shell, which is a program that is the operating system to the end user. It's called a shell because it wraps the kernel. It takes a line of text as input and produces an output. At this is called a command line interface. Not only can it connect to your own computer, but with the secure shell protocol, it can also connect to remote computers a network. Now that you have access to the mainframe, it's time to pick a programming language, which is a tool that uses the abstraction principle to make computers practical to work with for humans by simplifying different systems layer by layer. Some languages like Python are interpreted. That means there's a program called an interpreter that will execute each line of code 1 by 1. Other languages like c plus plus are compiled. They use a compiler to convert the entire program into machine code in advance before the CPU attempts to execute it. This results in an executable file that can be run by the operating system without any extra dependencies. Now every every programming language has a variety of built in data types to represent the data we're working with in our code. Instead of bytes, we work with more human friendly things select characters and numbers. Now, the most fundamental way to use data in your application is to declare a variable. This attaches a name to a data point, allowing you to reuse it somewhere else in your code. Python is a dynamically typed language, which means we don't need to tell the program exactly which data type is assigned to a variable. It just figures it out automatically. However, other languages like C are statically typed, and that means you need to specify the data type of a variable in your code. When you define a variable, its value is stored somewhere in memory on the hardware, and you may need to allocate and free up memory already throughout the program. A pointer is a variable whose value is the memory address of another variable, which can be used for low level memory control. Many languages don't want to deal with low level memory management and instead implement a garbage collector, which automatically allocates and deallocates memory when an object is no longer referenced in the program. Carpet day. No. Now, the data types available are different in every programming language, but typically you'll find int to represent whole numbers, switch may or may not be signed or unsigned to represent negative numbers as well. When numbers require a decimal point, they typically use the floating point type. It's called a float because there's only enough memory to represent a certain range of numbers at a certain precision, and is basically a form of scientific notation to make computers faster. If you need more range or precision, many languages also have a double that doubles the amount of memory used for the number. Now when it comes to characters, you'll typically find the char data type to represent a single character or more commonly a string to represent multiple characters together. Ultimately, these characters triggers get stored in a memory address somewhere, but they need to be stored in a certain order. When the order starts with the most significant byte and the smallest memory address, it's called big endian or vice versa, if the least significant byte is stored in the smallest address, it's called little endian. When it comes to practical software for engineering. One of the most fundamental things we do is organize data into data structures. The most useful data structure is probably the array or list. Just like a shopping list. It organizes multiple data points and order. However, it also maintains an index of integers that starts at 0 and goes up for every new item in the list. That can be useful, but you don't actually need an index to create a list of items. Another option is a link list where each item has a pointer to the next item in front of Another option is a stack that follows the last in first out principle. It's like stacking a set of plates, then when you want to access data, you pop the last one off the top. The inverse option is a q, which is first in first out. Just like when you get into the breadline, the first person there is the first one to be fed. Now, another extremely useful data structure is the hash, which might also be called a map or dictionary. It's like an array, but instead of an index of integers, you define the keys that point to each individual item, giving you a collection of key value pairs. In many cases though, it's not efficient to organize data in a linear way. To address that problem, we have trees, which organize nodes together in a hierarchy that can often be traversed more quickly. This can sometimes be too rigid of data structure though. So instead, a graph can be created to connect multiple nodes together in a virtually unlimited number of ways. A graph has a node for the data and an edge for the relationship between the data points. Data structures are essential, but they don't do anything by themselves. To do something useful, you'll need to code up an algorithm, which is just code that solves a problem. I took the initiative in creating the Internet. In our code, we have several mechanisms for implementing algorithms. The most fundamental of which is a function, which is a block of code that takes an input then does something and returns an output. Like a variable, a function has a name and it can called from other parts of your code with different input parameters called arguments. One thing you might do in the function body is compare one value to another. Every language has a variety of built in operators like equality, greater than, and less than that you can use to compare 2 values. If a is greater than b, then it forms a value of true, but if b is greater than a, then the value is false. True false is what's known as a boolean data type and whenever your code produces value like this, it's known as an expression, but not all code will produce a value. Sometimes your code will simply do something which is known as a statement. A good example is the if statement which handles conditional logic. For example, if the condition is true, it will execute this code, otherwise it will short circuit and run the code inside of the else block. Another very common type of statement is a loop. A while loop will run this block of code over and over again until the condition in the parenthesis becomes false. That can be useful, but more often than not, you'll want to loop over an iterable data type like an array. Most languages have a for loop that can run some code for every object in the array or iterable data structure. Now in some cases, a function may not have an output, which is generally called a void function. An interesting thing about functions is that they can call themselves. When a function calls itself, it's called recursion because when done like this by default, it will recurse forever creating an infinite loop. That happens because when you call a function, the programming language will put it into memory on what's known as the call stack, which is a short term chunk of memory for executing your code. When a function keeps calling itself, the language will keep pushing frames onto the call stack until you get a stack overflow error. To avoid this, your algorithm needs a base condition so it knows when to terminate the loop. Now, when you write an algorithm, you'll need to determine if it's any good, and the system for doing that is called big o notation. It's a standard format for approximating the performance have an algorithm at scale. It may reference time complexity, which is how fast your algorithm will run, and space complexity, which deals with how much memory is required to run it. Developers have many different algorithm types at their disposal. The most crude option is brute force, where you might loop over every possible combination to hack somebody's credit card pin. A more sophisticated approach might be divide and conquer, like binary search where you cut the problem in half multiple times until you find what you're looking for. Another option is dynamic programming algorithms, where a problem is broken down into multiple smaller sub problems and the result of each computation is stored for later use using a technique called memoization. That means if a function has already been called, it will use the existing value instead of recomputing it again from scratch. Then we have greedy algorithms that will make the choice that is most beneficial in the short term without considering the problem as a whole. One example of this is Dijkstra's shortest path algorithm. On the flip side, we have backtracking algorithms, which take a more incremental approach by looking at all the possible options, like a rat in a maze exploring all the different potential paths. Now, when it comes to implementing your code, there are always multiple ways to get the job done. One aiming paradigm is declarative, where your code describes what the program does and the outcome, but doesn't care about things like control flow. This style of programming is often associated with functional languages like Haskell. The other paradigm is imperative programming, where your code uses statements like if and while, providing explicit instructions about how to produce an outcome. It's associated with procedural languages like C. Today, most general purpose languages like Python, JavaScript, Hotline, Swift, and so on are multi paradigm, which means they support all these options at the same time, in addition to object oriented programming. The idea behind OOP is that you use classes to write a blueprint for the data or objects in your code. A class can encapsulate variables, which are commonly called properties, as well as functions, which are usually called methods in this context. It's a common way to organize and reuse code because classes can share behaviors between each other through inheritance, where a subclass can extend and override the behaviors of the parent class. And it opens the door to all kinds have other ideas called design patterns. Now, a class by itself doesn't actually do anything. Instead, it's used to instantiate objects, which are actual chunks of data that live in your computer's memory. Often, you'll want to reference the same object over and over again in your code. When data is long lived, it can't go in the call stack. Instead, most languages have a separate area of memory hold the heap, which unlike the call stack can grow and shrink based on how your application is used. It also allows you to pass objects by reference, which means you can use the same object in multiple variables without increasing the memory footprint because it always points to the same chunk of memory in the heap. Now, what's interesting is that if we go back to the CPU that we talked about in the beginning, you'll notice that it contains multiple threads. A thread takes the physical CPU core in breaks into virtual cores that allow it to run code simultaneously. There are some programming languages that support parallelism where you can write code that literally executes on 2 different threads at the same time. However, many languages out there are only single threaded, but that doesn't mean they can't do 2 things at the same time. Instead, they implement concurrency models like an event loop or coroutines that can pause or delay the normal execution of code to handle multiple job's on a single thread at the same time. Now, in modern computing, we're rarely working with the bare metal CPU and RAM. Instead, we work in the cloud with a virtual machine, which is just a piece set software that simulates hardware that allows us to take really big computers and split them up into a bunch of smaller virtual computers. These machines are the backbone of the Internet and are connected via the Internet protocol. Each machine has a unique IP address to identify it on the network. Work. That IP address is usually alias to a URL that is registered in a global database called the domain name service. Now to establish a connection, in. The 2 computers will perform a TCP handshake, which will allow them to exchange messages called packets. On top of that, there's usually security layer like SSL to encrypt and decrypt the messages over the network. Now the 2 computers can securely share data with the hypertext transfer for protocol. The client may request a web page, then the server will respond with some HTML. Modern servers provide a standardized way for a client to request data, which is called an application programming interface or API. The most common architecture is REST, where URLs are mapped to different data entities available on the server. And that brings us to our final topic, mother effing printers. You're gonna need to learn how these things work inside fighting out, because every time you go to grandma's house, she's going to ask you to fix it, which shouldn't be a problem for a computer scientist like you. Thanks for watching, and I will see you in the next one.\n"
     ]
    }
   ],
   "source": [
    "json_string = df.loc[0, 'transcript_json']\n",
    "\n",
    "# Display the loaded JSON object\n",
    "print(json_string[\"results\"][\"channels\"][0][\"alternatives\"][0][\"transcript\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Assuming your DataFrame is named df\n",
    "\n",
    "def parse_json(row):\n",
    "    try:\n",
    "        transcript_json = row['transcript_json']\n",
    "        if transcript_json[\"results\"][\"summary\"][\"result\"] == \"success\":\n",
    "            transcript = str(transcript_json[\"results\"][\"channels\"][0][\"alternatives\"][0][\"transcript\"])\n",
    "            summary = str(transcript_json[\"results\"][\"summary\"][\"short\"])\n",
    "            return transcript, summary\n",
    "        else:\n",
    "            print(\"an error occurred\")\n",
    "            return None, None\n",
    "    except (json.JSONDecodeError, KeyError):\n",
    "        print(\"an exception occurred\")\n",
    "        return None, None\n",
    "\n",
    "# Apply the custom function to each row\n",
    "df[['transcript', 'summary']] = df.apply(parse_json, axis=1, result_type='expand')\n",
    "\n",
    "# Display the updated DataFrame\n",
    "# print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "import pandas as pd\n",
    "final_dataset = Dataset.from_pandas(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['author', 'duration', 'description', 'transcript_json', 'audio_path', 'link', 'title', 'views', 'transcript', 'summary'],\n",
       "    num_rows: 522\n",
       "})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset.push_to_hub(\"Huggingface-userId/FS_transcribe_summary\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Prompt formatting to the following format\n",
    "```\n",
    "[INST]\n",
    "You are youtuber called {author} you make engaging high-intensity and entertaining coding tutorials and tech news. \n",
    "you covers a wide range of topics relevant to programmers, aiming to help them learn and improve their skills quickly.\n",
    "\n",
    "Given the title of the video : {title} \n",
    "and a small summary : {video_summary}\n",
    "[/INST]\n",
    "\n",
    "Generate the video : {video_transcript}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>duration</th>\n",
       "      <th>description</th>\n",
       "      <th>transcript_json</th>\n",
       "      <th>audio_path</th>\n",
       "      <th>link</th>\n",
       "      <th>title</th>\n",
       "      <th>views</th>\n",
       "      <th>transcript</th>\n",
       "      <th>summary</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>787</td>\n",
       "      <td>Learn the fundamentals of Computer Science wit...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_Computer_Science_Concepts_Expla...</td>\n",
       "      <td>https://www.youtube.com/watch?v=-uleG_Vecis</td>\n",
       "      <td>100+ Computer Science Concepts Explained</td>\n",
       "      <td>2110216</td>\n",
       "      <td>What's the first thing you should do when your...</td>\n",
       "      <td>The importance of hardware and memory for a co...</td>\n",
       "      <td>\\n        [INST]\\n        You are youtuber cal...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>743</td>\n",
       "      <td>The ultimate 10 minute JavaScript course that ...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_JavaScript_Concepts_you_Need_to...</td>\n",
       "      <td>https://www.youtube.com/watch?v=lkIFF4maKMU</td>\n",
       "      <td>100+ JavaScript Concepts you Need to Know</td>\n",
       "      <td>1642938</td>\n",
       "      <td>JavaScript. It's a wonderful programming langu...</td>\n",
       "      <td>The speaker explains that JavaScript is a prog...</td>\n",
       "      <td>\\n        [INST]\\n        You are youtuber cal...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>798</td>\n",
       "      <td>WebDev 101 is a complete introduction into the...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100+_Web_Development_Things_you_Shou...</td>\n",
       "      <td>https://www.youtube.com/watch?v=erEgovG9WBs</td>\n",
       "      <td>100+ Web Development Things you Should Know</td>\n",
       "      <td>1296840</td>\n",
       "      <td>Web development is the best job in the world. ...</td>\n",
       "      <td>The internet is a collection of machines conne...</td>\n",
       "      <td>\\n        [INST]\\n        You are youtuber cal...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>1471</td>\n",
       "      <td>Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\100_Firebase_Tips,_Tricks,_and_Screw...</td>\n",
       "      <td>https://www.youtube.com/watch?v=iWEgpdVSZyg</td>\n",
       "      <td>100 Firebase Tips, Tricks, and Screw-ups</td>\n",
       "      <td>177364</td>\n",
       "      <td>Welcome to my top 10 Firebase tips. Welcome to...</td>\n",
       "      <td>The speakers discuss how to build successful r...</td>\n",
       "      <td>\\n        [INST]\\n        You are youtuber cal...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Fireship</td>\n",
       "      <td>246</td>\n",
       "      <td>Google made a ton of exciting announcements at...</td>\n",
       "      <td>{'metadata': {'channels': 1, 'created': '2024-...</td>\n",
       "      <td>downloads\\10_crazy_announcements_from_Google_I...</td>\n",
       "      <td>https://www.youtube.com/watch?v=nmfRDRNjCnM</td>\n",
       "      <td>10 crazy announcements from Google I/O</td>\n",
       "      <td>968111</td>\n",
       "      <td>It is May 11, 2023, and you're watching the Co...</td>\n",
       "      <td>In this video, the speakers discuss Google's u...</td>\n",
       "      <td>\\n        [INST]\\n        You are youtuber cal...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     author  duration                                        description  \\\n",
       "0  Fireship       787  Learn the fundamentals of Computer Science wit...   \n",
       "1  Fireship       743  The ultimate 10 minute JavaScript course that ...   \n",
       "2  Fireship       798  WebDev 101 is a complete introduction into the...   \n",
       "3  Fireship      1471  Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...   \n",
       "4  Fireship       246  Google made a ton of exciting announcements at...   \n",
       "\n",
       "                                     transcript_json  \\\n",
       "0  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "1  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "2  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "3  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "4  {'metadata': {'channels': 1, 'created': '2024-...   \n",
       "\n",
       "                                          audio_path  \\\n",
       "0  downloads\\100+_Computer_Science_Concepts_Expla...   \n",
       "1  downloads\\100+_JavaScript_Concepts_you_Need_to...   \n",
       "2  downloads\\100+_Web_Development_Things_you_Shou...   \n",
       "3  downloads\\100_Firebase_Tips,_Tricks,_and_Screw...   \n",
       "4  downloads\\10_crazy_announcements_from_Google_I...   \n",
       "\n",
       "                                          link  \\\n",
       "0  https://www.youtube.com/watch?v=-uleG_Vecis   \n",
       "1  https://www.youtube.com/watch?v=lkIFF4maKMU   \n",
       "2  https://www.youtube.com/watch?v=erEgovG9WBs   \n",
       "3  https://www.youtube.com/watch?v=iWEgpdVSZyg   \n",
       "4  https://www.youtube.com/watch?v=nmfRDRNjCnM   \n",
       "\n",
       "                                         title    views  \\\n",
       "0     100+ Computer Science Concepts Explained  2110216   \n",
       "1    100+ JavaScript Concepts you Need to Know  1642938   \n",
       "2  100+ Web Development Things you Should Know  1296840   \n",
       "3     100 Firebase Tips, Tricks, and Screw-ups   177364   \n",
       "4       10 crazy announcements from Google I/O   968111   \n",
       "\n",
       "                                          transcript  \\\n",
       "0  What's the first thing you should do when your...   \n",
       "1  JavaScript. It's a wonderful programming langu...   \n",
       "2  Web development is the best job in the world. ...   \n",
       "3  Welcome to my top 10 Firebase tips. Welcome to...   \n",
       "4  It is May 11, 2023, and you're watching the Co...   \n",
       "\n",
       "                                             summary  \\\n",
       "0  The importance of hardware and memory for a co...   \n",
       "1  The speaker explains that JavaScript is a prog...   \n",
       "2  The internet is a collection of machines conne...   \n",
       "3  The speakers discuss how to build successful r...   \n",
       "4  In this video, the speakers discuss Google's u...   \n",
       "\n",
       "                                                text  \n",
       "0  \\n        [INST]\\n        You are youtuber cal...  \n",
       "1  \\n        [INST]\\n        You are youtuber cal...  \n",
       "2  \\n        [INST]\\n        You are youtuber cal...  \n",
       "3  \\n        [INST]\\n        You are youtuber cal...  \n",
       "4  \\n        [INST]\\n        You are youtuber cal...  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Assuming your DataFrame is named df\n",
    "\n",
    "def create_prompt(row):\n",
    "    try:\n",
    "        author = row[\"author\"]\n",
    "        title = row[\"title\"]\n",
    "        video_transcript = row[\"transcript\"]\n",
    "        video_summary = row[\"summary\"]\n",
    "        # transcript_json = row['transcript_json']\n",
    "        text = f\"\"\"\n",
    "        [INST]\n",
    "        You are youtuber called {author} you make engaging high-intensity and entertaining coding tutorials and tech news. \n",
    "        you covers a wide range of topics relevant to programmers, aiming to help them learn and improve their skills quickly.\n",
    "        \n",
    "        Given the title of the video : {title} \n",
    "        and a small summary : {video_summary}\n",
    "        [/INST]\n",
    "        \n",
    "        Generate the video : {video_transcript}\n",
    "        \"\"\"        \n",
    "        return text\n",
    "\n",
    "    except (json.JSONDecodeError, KeyError):\n",
    "        print(\"an exception occurred\")\n",
    "        return None\n",
    "\n",
    "# Apply the custom function to each row\n",
    "df['text'] = df.apply(create_prompt, axis=1, result_type='expand')\n",
    "\n",
    "# Display the updated DataFrame\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "import pandas as pd\n",
    "final_dataset = Dataset.from_pandas(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset.push_to_hub(\"Huggingface-userId/FS_transcribe_summary_prompt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Parsing operation for the api response from deepgram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"dummy.json\",\"r\") as f:\n",
    "    transcribe_json_list = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Have you ever woken up in the middle of the night in a panic wondering how to extract a polygonal mesh of an isosurface from a 3 dimensional discrete scalar field? Yeah. I didn't think so. But back in 87, 2 programmers at General Electric did. They created and patented the marching cubes algorithm, an algorithm that has likely saved countless lives by allowing doctors to visualize data from CT and MRI scans. Whenever you instruct a machine to solve a problem with code, you're creating an algorithm, a procedure for rearranging ones and zeros that can make animals talk and vacuums walk. Most algorithms belong in a dumpster, but some are fast, skin. Some are beautiful and some are so weird, they're indistinguishable from magic. Today, we'll look at 10 of the most interesting algorithms ever engineered sphere, and how they're used to solve very interesting problems in the real world. 1st on the list, we have wave function collapse. One of the weirdest things in all of science is the double slit experiment, where particles behave like a wave when you're not looking, but when you look, they suddenly collapse down to a particle. It seems counterintuitive, but it makes total sense when you realize we're living in a simulation and the universe wrote algorithm to cut down on its AWS build. It's an interesting concept to think about philosophically, but the general idea behind wave function collapse can also be implemented programmatically. Imagine we have a map for a video game, but what if this is a side scrolling game that can go on for eternity? We can't just make a bigger map, we need an algorithm to procedurally generate it on the fly. What's so weird is that we can take this initial map and think of it as being in the initial superposition of all possibilities. It's the wave function. Then upon observation, it collapses into a particle. Or in other words, it selects a random map tile but follows a consistent set of rules, like in this case, making sure that the roads are always connected, providing a random yet cohesive result and doesn't rely on any sort of modern generative AI. Speaking of which, AI is weird as hell. Diffusion is a machine learning algorithm originally developed at OpenAI and is the magic behind image generators like DALL E and Stable Diffusion. But the concept of diffusion actually comes from thermodynamics, where particles spread from areas of higher concentration to lower concentration. In artificial intelligence, the process is reversed. The algorithm starts by generating random noise, which would be like high entropy and thermodynamics, and gradually refines it to a structured image, which would be lower entropy. But first, you'll need to train model that can do this well. The diffusion algorithm works in 2 phases. In the forward phase, it gradually adds noise to an image step by step until it becomes completely random. In the second phase, the algorithm reverses this process, reconstructing it back into a coherent image. When the algorithm runs over millions of labeled images, we get a collection of weights that can be used to generate new images out of thin air, allowing us to build an infinite army OnlyFans models. It's highly compute intensive, but also works well on audio. And the next frontier is diffusion for video generation. But now Now let's talk about simulated annealing. One frustrating thing about programming is that for many problems, there's not just one solution, but many solutions. Like an Amazon warehouse has many different ways to organize its inventory, but some ways are more efficient than others. Annealing is a word that comes from metallurgy, where metals are skid heated up and cooled down over and over again to remove defects. The same idea is used in optimization algorithms to find the best answer or n a c of good answers. Imagine trying to find the highest point in a mountain range full of peaks and valleys. A simple hill climb algorithm won't work because there are many local peaks. Initially, the temperature sky allowing the algorithm to explore freely. As time goes on though, the temperature is lowered, which decreases the probability of accepting a worse solution. The off here is exploration versus exploitation. But the reason I included this algorithm is because it's also a good way for beginners to learn how to code. Initially, you start out exploring all kinds of different technologies and frameworks, then eventually you find one specific area to exploit and specialize in. But we can't talk about algorithms without talking about sorting. And the most ingenious sorting algorithm of all time is without a doubt, sleep sort. The majority of sane sorting algorithms out there use strategies like divide conquer to break up an array into subarrays where it can be sorted more efficiently. However, some random genius on 4 chan found a better way, but it's a bit unconventional. Here's what the code looks like in bash. It's incredibly simple. It loops over the array, and then for each element, it opens up a new thread that sleeps for the amount of time proportional to the value of its element. Then finally, after waking up, it prints that element. It's genius because it delegates the sorting to the CPU scheduler. It's also dumb and useless because it delegates sorting to the CPU scheduler. Speaking of which, you might be familiar with another useless sorting algorithm, BOGO sort, which tries to sort an array by randomly guessing over and over again. It's like playing the lottery. But what if we apply the same algorithm with quantum mechanics to the multiverse? If we're to trust multiverse science, we know that all possible outcomes exist in separate parallel universes. That means as a developer, if you find yourself with an unsorted array, there's or some other parallel universe where it is sorted. The technology isn't quite there yet, but if we could randomly observe these other universes to find the sorted array, we could then use a portal gun to travel to that universe, which would make our lives much easier. Although, we would obviously have to kill the version of ourself in that universe, but if it's a large array, quantum Bogosort might be worth it. That's purely hypothetical, but one of the most practical and goaded algorithms of all time is SCAR SA, a public key cryptosystem. It's essential for digital security, allowing people on the Internet to lock their mailboxes and sign their letters with a unique signature. But it's based on one simple mathematical reality. Multiplying large numbers to find 2 original large prime numbers is extremely difficult and time like it take your laptop 300,000,000,000,000 years to brute force. Unless quantum computers become a thing, and we can start leveraging Shor's algorithm, which can solve the integer factorization problem exponentially faster than any classical algorithm. Prime factoring is pretty simple, but how this algorithm does it is where things get weird. It relies on concepts like cubits, skits superposition and entanglement to perform massive amounts of calculations in parallel. The algorithm is legit, but so far, the biggest number ever factored is 21. Even IBM's state of the art Q System 1 fails when trying to factor the number 35. However, just recently, skin. The Chinese factored this big ass number with a quantum computer, but it uses a different algorithm that doesn't scale very well for large numbers unlike Shor's algorithm. Everything is safe for now, but when someone figures out how to make quantum computers work, expect all hell to break loose in the cybersecurity world. At the beginning of this video, I mentioned the marching cubes algorithm, but it deserves a closer look. So first, we start with a 3 d scalar field, which might represent data from an MRI machine. Each point in the 3 d space is represented by a single number or a scalar. The algorithm starts with a single point, then takes its 8 neighboring locations to form an cube, but treats the eight values as a bit in an 8 bit integer. This results in 256 different possibilities, which point to a precalculated array of polygons. The algorithm marches through each point to create a 3 d mesh that can be used in 3 d software. And at the time, this was really cool because as MRI machines produce slices of data that can now be rendered in 3 d. In modern times, though, we're often dealing with distributed systems in the cloud, And that brings us to the Byzantine generals problem. Imagine you're a general in the Byzantine army. You're camped around a city with a few other generals with plans to attack it the next morning. But what if one of the generals skitrunk and wakes up too hungover to attack. The entire system could collapse. Computers have the same problem. Sometimes they might fail or be infiltrated by hackers, and you never know when or where that's going to happen. Luckily, algorithms like PBFT are designed to solve this. They can keep a distributed network working properly even if up to 1 third of its nodes go haywire. It works by having a node broadcast a pre prepare message to other nodes, indicating its readiness to execute some code that will change the system. The other nodes will respond back in agreement. Then after a certain threshold, a consensus is formed. Once there's a consensus, The original node sends back a commit message to all the other nodes, which can then all execute the changes, making the entire state of the system consistent. Algorithms like this are essential for blockchain technology and things like distributed cloud databases. What's really cool about algorithms, though, is that they can also reflect nature, like Boyd's artificial life program. It was created back in 86 and simulates the flocking behavior of birds. What's so cool about it is that it demonstrates the emergent complexity or beauty that we get out of just a few simple rules. In this case, the birdoids have three rules. They steer to avoid crowding, they steer towards the average heading of the flock, and they steer towards the center of mass of their local flockmates. The end result are these intricate patterns that weren't programmed directly, but just emerge naturally. But finally, that brings us to an old algorithm that blew my mind just the other day and inspired this video, Boyer Moore string search. It's weird because it becomes faster and more efficient as the string it's searching becomes bigger. That seems impossible, but it makes sense when you understand the algorithm. It scans skin's text from right to left, then has two rules. When it encounters a bad character not found in the search pattern, it jumps past it based on an estimation made in a preprocess table. Scale. Likewise, if it finds a partial match, then a mismatch occurs. It has a separate pre calculated table that maximizes the number of characters it can safely skip. These rules are called heuristics, which are like functions that are not guaranteed to be perfect, but are far more practical than looping over every single character. In this case, the algorithm gets faster with more text because it's able to skip a higher proportion of characters. And if you've ever wondered why GREP is so fast, you have this algorithm to thank.\""
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transcribe_json_list[0][\"results\"][\"channels\"][0][\"alternatives\"][0][\"transcript\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'success'"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transcribe_json_list[0][\"results\"][\"summary\"][\"result\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The speakers discuss the use of algorithms in scientific research, including random random algorithms like BOGO sort and BOGO sort to solve problems in scientific research, and the potential uses of these algorithms in optimizing algorithms and algorithms for algorithms. They also touch on the use of quantum algorithms in machine design and the future of digital security, including the use of random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transcribe_json_list[0][\"results\"][\"summary\"][\"short\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "final_dataset = []\n",
    "# for video_link in tqdm(video_links, desc='Processing videos', unit='video'):\n",
    "for transcribe_json in tqdm(transcribe_json_list,desc='Processing transcribe'):\n",
    "    transcribe = transcribe_json[\"results\"][\"channels\"][0][\"alternatives\"][0][\"transcript\"]\n",
    "    if transcribe_json[\"results\"][\"summary\"][\"result\"]==\"success\":\n",
    "        summary = transcribe_json[\"results\"][\"summary\"][\"short\"]\n",
    "    final_json = {\n",
    "        \"transcribe\": transcribe,\n",
    "        \"summary\": summary\n",
    "    }\n",
    "    final_dataset.append(final_json)\n",
    "\n",
    "with open(\"transcribe_data_final_processed.json\", \"w\") as output:\n",
    "    json.dump(final_dataset, output)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "transcribe_json_list[0][\"channels\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset_transcribe = load_dataset(\"json\",data_files=\"./transcribe_data_final.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset_transcribe.push_to_hub(\"Huggingface-userId/FS_transcribe_summary\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset_transcribe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_dataset_transcribe[\"train\"][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "with open(\"./video_data_and_transcripts.json\") as F:\n",
    "    json_data = json.load(F)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(json_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
